| Method | KNN(k=5) | LDA | QDA | LR | RF(tuning parameter: mtry=3; ntrees=500) | SVM(tuning parameter: cost=10; gamma=1; sigma = 8.691262; C = 1) |
|---|---|---|---|---|---|---|
| Accuracy | 99.6% | 98.3% | 99.4% | 99.5% | 99.7% | 99.7% |
| AUC | 99.8% | 98.7% | 99.8% | 99.8% | 99.0% | 99.4% |
| ROC | in tabs | in tabs | in tabs | in tabs | in tabs | in tabs |
| Threshold | .66 | .85 | .40 | .29 | .50 | .52 |
| Sensitivity | 93.1% | 74.0% | 85.1% | 91.0% | 95.2% | 94.0% |
| Specificity | 99.9% | 99.2% | 99.6% | 99.8% | 99.8% | 99.9% |
| FDR | 2.2% | 24.6% | 1.7% | 4.7% | 4.8% | 2.8% |
| Precision | 97.8% | 75.4% | 98.3% | 95.3% | 95.2% | 97.2% |
# Package setup ----
# All modeling is done with caret on top of the individual model packages.
library(tidyverse)
library(dplyr)
library(caret)
library(class)
library(yardstick)
library(plotly)
library(boot)
library(pROC)
library(glmnet)
library(purrr)
library(gridExtra)
library(randomForest)
library(e1071)
# kernlab backs caret's "svmRadial" method. Install it only if it is missing
# rather than unconditionally on every run (a bare install.packages() in a
# script re-downloads the package each execution), then load it.
if (!requireNamespace("kernlab", quietly = TRUE)) {
  install.packages("kernlab")
}
library(kernlab)
# Data preparation ----
# Seed the RNG so the row shuffle (and therefore the train/test split) is
# reproducible across runs; the original run was unseeded.
set.seed(1)
# Reading in the data
data <- read.csv("HaitiPixels.csv", header = TRUE, sep = ",")
# Binary target: is the pixel classified as a blue tarp?
data <- data %>%
  mutate(BlueClass = as.factor(ifelse(Class == "Blue Tarp", "Yes", "No")))
# Check the levels just specified
levels(data$BlueClass)
## [1] "No" "Yes"
# Keep columns 2-5 (Red, Green, Blue predictors + BlueClass target);
# drop the original multi-level Class column
data <- data[c(2:5)]
# Temporary row id used only to verify the shuffle below
data <- data %>% mutate(id = row_number())
# Check addition
head(data$id)
## [1] 1 2 3 4 5 6
# Shuffle data to fairly split into test / train
shuffleddata <- sample_n(data, nrow(data))
# Check that it has been shuffled
head(shuffleddata$id) # different than the first six lines of the csv file
## [1] 52330 20209 39644 38201 16443 12272
# Remove the id column
shuffleddata <- shuffleddata[c(1:4)]
# Split the data into test and train for use in our upcoming models,
# using 10k-row subsets for faster knn execution and to avoid file
# freezing issues
samp <- 1:10000
samp2 <- 10001:20000
train <- shuffleddata[samp, ]
test <- shuffleddata[samp2, ]
head(train)
head(test)
# Model training rules for all models: 10-fold CV, keep class probabilities
# and the final hold-out predictions so thresholds can be tuned afterwards.
train_control <- caret::trainControl(method = "cv", number = 10,
                                     returnResamp = "all", classProbs = TRUE,
                                     savePredictions = "final")
# KNN model ----
system.time({
  knnmod <- train(BlueClass ~ Red + Green + Blue, data = train,
                  trControl = train_control, method = "knn",
                  preProcess = c("center", "scale"),
                  tuneGrid = expand.grid(k = 1:15))
})
## user system elapsed
## 20.873 0.105 21.140
knnmod
## k-Nearest Neighbors
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9001, 9000, 8999, 9000, 9001, 8999, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.9966007 0.9477581
## 2 0.9961002 0.9402912
## 3 0.9960004 0.9383623
## 4 0.9963002 0.9430485
## 5 0.9963006 0.9433100
## 6 0.9962001 0.9413603
## 7 0.9962001 0.9406740
## 8 0.9962998 0.9425493
## 9 0.9958995 0.9357883
## 10 0.9957995 0.9343295
## 11 0.9960996 0.9392304
## 12 0.9958998 0.9360504
## 13 0.9957000 0.9328276
## 14 0.9957998 0.9345563
## 15 0.9954998 0.9297910
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 1.
# Plot KNN accuracy across the k grid
plot(knnmod)
# Set prediction, probability, and cv score variables in case needed
knnmod_pred <- predict(knnmod, test, 'raw')
knnmod_prob <- predict(knnmod, test, 'prob')
knnmod_scored <- cbind(test, knnmod_pred, knnmod_prob)
# AUC/ROC
# "Yes" is the second factor level, so pass event_level = "second" directly.
# The global options(yardstick.event_first = FALSE) is deprecated as of
# yardstick 0.0.7 and will be ignored entirely in a future version.
knn_auc <- knnmod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
knn_auc
# ROC curve + plot
ROC_curve <- knnmod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot <- ROC_curve %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle('KNN ROC curve')
ggplotly(ROC_curve_plot)
# Re-threshold the CV hold-out predictions
knnmod_pred2 <- knnmod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .66, 99.7% best.
  mutate(prediction = ifelse(Yes > .66, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# Confusion matrix
confusionMatrix(knnmod_pred2$prediction, knnmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9648 16
## Yes 18 318
##
## Accuracy : 0.9966
## 95% CI : (0.9953, 0.9976)
## No Information Rate : 0.9666
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9475
##
## Mcnemar's Test P-Value : 0.8638
##
## Sensitivity : 0.9521
## Specificity : 0.9981
## Pos Pred Value : 0.9464
## Neg Pred Value : 0.9983
## Prevalence : 0.0334
## Detection Rate : 0.0318
## Detection Prevalence : 0.0336
## Balanced Accuracy : 0.9751
##
## 'Positive' Class : Yes
##
# LDA model ----
system.time({
  # Note: MASS::lda() has no `family` argument; the original call passed
  # family = "binomial", which was silently swallowed via `...`, so it is
  # dropped here.
  ldamod <- train(BlueClass ~ Red + Green + Blue, data = train,
                  trControl = train_control, method = "lda",
                  preProcess = c("center", "scale"))
})
## user system elapsed
## 1.084 0.070 1.366
ldamod
## Linear Discriminant Analysis
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9001, 9000, 9000, 9001, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9846001 0.7717723
# Set prediction, probability, and cv score variables in case needed
ldamod_pred <- predict(ldamod, test, 'raw')
ldamod_prob <- predict(ldamod, test, 'prob')
ldamod_scored <- cbind(test, ldamod_pred, ldamod_prob)
# AUC/ROC ("Yes" is the second factor level; explicit event_level replaces
# the deprecated options(yardstick.event_first = FALSE))
lda_auc <- ldamod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
lda_auc
# ROC curve + plot
ROC_curve2 <- ldamod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot2 <- ROC_curve2 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  ggtitle('LDA ROC curve')
ggplotly(ROC_curve_plot2)
# Set new threshold on the CV hold-out predictions
ldamod_pred2 <- ldamod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .85, 98.6% best.
  mutate(prediction = ifelse(Yes > .85, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(ldamod_pred2$prediction, ldamod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9597 81
## Yes 69 253
##
## Accuracy : 0.985
## 95% CI : (0.9824, 0.9873)
## No Information Rate : 0.9666
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7636
##
## Mcnemar's Test P-Value : 0.3691
##
## Sensitivity : 0.7575
## Specificity : 0.9929
## Pos Pred Value : 0.7857
## Neg Pred Value : 0.9916
## Prevalence : 0.0334
## Detection Rate : 0.0253
## Detection Prevalence : 0.0322
## Balanced Accuracy : 0.8752
##
## 'Positive' Class : Yes
##
# QDA model ----
system.time({
  # Note: MASS::qda() has no `family` argument; the original call passed
  # family = "binomial", which was silently swallowed via `...`, so it is
  # dropped here.
  qdamod <- train(BlueClass ~ Red + Green + Blue, data = train,
                  trControl = train_control, method = "qda",
                  preProcess = c("center", "scale"))
})
## user system elapsed
## 1.064 0.002 1.083
qdamod
## Quadratic Discriminant Analysis
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 9000, 8999, 9001, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9945991 0.9091862
# Set prediction, probability, and cv score variables in case needed
qdamod_pred <- predict(qdamod, test, 'raw')
qdamod_prob <- predict(qdamod, test, 'prob')
qdamod_scored <- cbind(test, qdamod_pred, qdamod_prob)
# AUC/ROC ("Yes" is the second factor level; explicit event_level replaces
# the deprecated options(yardstick.event_first = FALSE))
qda_auc <- qdamod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
qda_auc
# ROC curve + plot
ROC_curve3 <- qdamod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot3 <- ROC_curve3 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  # fixed copy-paste bug: this plot was previously titled 'LDA ROC curve'
  ggtitle('QDA ROC curve')
ggplotly(ROC_curve_plot3)
# Set new threshold on the CV hold-out predictions
qdamod_pred2 <- qdamod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .40, 99.5% best.
  mutate(prediction = ifelse(Yes > .40, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(qdamod_pred2$prediction, qdamod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9661 47
## Yes 5 287
##
## Accuracy : 0.9948
## 95% CI : (0.9932, 0.9961)
## No Information Rate : 0.9666
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9143
##
## Mcnemar's Test P-Value : 1.303e-08
##
## Sensitivity : 0.8593
## Specificity : 0.9995
## Pos Pred Value : 0.9829
## Neg Pred Value : 0.9952
## Prevalence : 0.0334
## Detection Rate : 0.0287
## Detection Prevalence : 0.0292
## Balanced Accuracy : 0.9294
##
## 'Positive' Class : Yes
##
# GLM (logistic regression) model ----
system.time({
  glmmod <- train(BlueClass ~ Red + Green + Blue, data = train,
                  trControl = train_control, method = "glm",
                  preProcess = c("center", "scale"), family = "binomial")
})
## user system elapsed
## 1.848 0.014 1.901
glmmod
## Generalized Linear Model
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9001, 8999, 9000, 9000, 9000, 8999, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9952998 0.924728
# Set prediction, probability, and cv score variables in case needed
glmmod_pred <- predict(glmmod, test, 'raw')
glmmod_prob <- predict(glmmod, test, 'prob')
glmmod_scored <- cbind(test, glmmod_pred, glmmod_prob)
# AUC/ROC ("Yes" is the second factor level; explicit event_level replaces
# the deprecated options(yardstick.event_first = FALSE)).
# Fixed variable-name bug: this AUC was previously stored in `qda_auc`,
# which both mislabeled it and clobbered the QDA section's result.
glm_auc <- glmmod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
glm_auc
# ROC curve + plot
ROC_curve4 <- glmmod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot4 <- ROC_curve4 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  # fixed copy-paste bug: this plot was previously titled 'LDA ROC curve'
  ggtitle('Logistic Regression ROC curve')
ggplotly(ROC_curve_plot4)
# Set new threshold on the CV hold-out predictions
glmmod_pred2 <- glmmod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .29, 99.6% best.
  mutate(prediction = ifelse(Yes > .29, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(glmmod_pred2$prediction, glmmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9652 25
## Yes 14 309
##
## Accuracy : 0.9961
## 95% CI : (0.9947, 0.9972)
## No Information Rate : 0.9666
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9386
##
## Mcnemar's Test P-Value : 0.1093
##
## Sensitivity : 0.9251
## Specificity : 0.9986
## Pos Pred Value : 0.9567
## Neg Pred Value : 0.9974
## Prevalence : 0.0334
## Detection Rate : 0.0309
## Detection Prevalence : 0.0323
## Balanced Accuracy : 0.9619
##
## 'Positive' Class : Yes
##
# Random Forest model ----
# Choosing tuning parameters:
# https://discuss.analyticsvidhya.com/t/how-to-decide-no-of-ntrees-in-randomforest/6882/3
# https://rpubs.com/phamdinhkhanh/389752
# Same 10-fold CV control as the other models (identical to the definition
# at the top of the script; repeated here so this section can run on its own).
train_control <- caret::trainControl(method = "cv", number = 10,
                                     returnResamp = "all", classProbs = TRUE,
                                     savePredictions = "final")
# https://stackoverflow.com/questions/10085806/extracting-specific-columns-from-a-data-frame
df <- train %>%
  select(Red, Green, Blue)
# mtryStart defaults to sqrt(p). With only 3 predictors the valid range for
# mtry is 1..3, so start the search at 3; the original mtry = 5 triggered an
# "invalid mtry: reset to within valid range" warning. The captured output
# below is from the original run.
(tuneRF(df, train$BlueClass, mtry = 3, ntree = 500, stepFactor = 5,
        improve = 0.05, trace = TRUE, plot = TRUE, doBest = TRUE))
## Warning in randomForest.default(x, y, mtry = mtryStart, ntree = ntreeTry, :
## invalid mtry: reset to within valid range
## mtry = 5 OOB error = 0.38%
## Searching left ...
## mtry = 1 OOB error = 0.32%
## 0.1578947 0.05
## Searching right ...
## mtry = 3 OOB error = 0.38%
## -0.1875 0.05
##
## Call:
## randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 0.35%
## Confusion matrix:
## No Yes class.error
## No 9656 10 0.001034554
## Yes 25 309 0.074850299
# mtry = 3
## ------
tunegrid <- expand.grid(.mtry = 3)
modellist <- list()
# Train with different ntree parameters and inspect bias/variance tradeoff.
# (If re-enabled, note that `trControl = control` below should be
# `trControl = train_control` — `control` is never defined in this file.)
# findtrees1 <- train(BlueClass~Red+Green+Blue,
#                     data=train,
#                     method = 'rf',
#                     metric = 'Accuracy',
#                     tuneGrid = tunegrid,
#                     trControl = control,
#                     ntree = 50)
# findtrees1
# findtrees2 <- train(BlueClass~Red+Green+Blue,
#                     data=train,
#                     method = 'rf',
#                     metric = 'Accuracy',
#                     tuneGrid = tunegrid,
#                     trControl = control,
#                     ntree = 100)
# findtrees2
system.time({
  RF <- train(BlueClass ~ Red + Green + Blue,
              data = train,
              method = 'rf',
              metric = 'Accuracy',
              tuneGrid = tunegrid,
              trControl = train_control,
              ntree = 500)
})
## user system elapsed
## 17.444 1.381 30.799
RF
## Random Forest
##
## 10000 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 9000, 9000, 9000, 8999, 9000, 9000, ...
## Resampling results:
##
## Accuracy Kappa
## 0.9959004 0.9354476
##
## Tuning parameter 'mtry' was held constant at a value of 3
# Set prediction, probability, and cv score variables in case needed
rfmod_pred <- predict(RF, test, 'raw')
rfmod_prob <- predict(RF, test, 'prob')
rfmod_scored <- cbind(test, rfmod_pred, rfmod_prob)
# AUC/ROC ("Yes" is the second factor level; explicit event_level replaces
# the deprecated options(yardstick.event_first = FALSE))
rf_auc <- rfmod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
rf_auc
# ROC curve + plot. Renamed from ROC_curve4/ROC_curve_plot4, which clobbered
# the GLM section's objects of the same name.
ROC_curve5 <- rfmod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot5 <- ROC_curve5 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  # fixed copy-paste bug: this plot was previously titled 'LDA ROC curve'
  ggtitle('Random Forest ROC curve')
ggplotly(ROC_curve_plot5)
# Set new threshold on the CV hold-out predictions
rfmod_pred2 <- RF$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .50, 99.7% best.
  mutate(prediction = ifelse(Yes > .50, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(rfmod_pred2$prediction, rfmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9650 25
## Yes 16 309
##
## Accuracy : 0.9959
## 95% CI : (0.9944, 0.9971)
## No Information Rate : 0.9666
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9357
##
## Mcnemar's Test P-Value : 0.2115
##
## Sensitivity : 0.9251
## Specificity : 0.9983
## Pos Pred Value : 0.9508
## Neg Pred Value : 0.9974
## Prevalence : 0.0334
## Detection Rate : 0.0309
## Detection Prevalence : 0.0325
## Balanced Accuracy : 0.9617
##
## 'Positive' Class : Yes
##
# SVM model ----
# Choosing tuning parameters with e1071::tune() (seeded so the CV folds used
# by all three kernel searches are reproducible).
# Linear
set.seed(1)
tune.out.linear <- tune(svm, BlueClass ~ Red + Green + Blue, data = train,
                        kernel = "linear",
                        ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
# Radial
tune.out.radial <- tune(svm, BlueClass ~ Red + Green + Blue, data = train,
                        kernel = "radial",
                        ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                                      gamma = c(0.5, 1, 2, 3, 4)))
# lowest error is radial, cost 10, gamma 1
# Poly
tune.out.poly <- tune(svm, BlueClass ~ Red + Green + Blue, data = train,
                      kernel = "polynomial",
                      ranges = list(cost = c(0.1, 1, 10, 100, 1000),
                                    degree = c(1, 2, 3, 4, 5)))
## ------
system.time({
  # caret's "svmRadial" (kernlab::ksvm) is tuned via `sigma` and `C`, not
  # e1071's cost/gamma. The original call passed cost = 10, gamma = 1, which
  # were silently ignored via `...` and caret tuned sigma/C itself (the
  # captured run selected sigma = 8.691262, C = 1). Map the tune() winner
  # onto kernlab's parameterization instead: C = cost, sigma = gamma.
  svmmod <- train(BlueClass ~ Red + Green + Blue,
                  data = train,
                  method = 'svmRadial',
                  metric = 'Accuracy',
                  trControl = train_control,
                  tuneGrid = expand.grid(sigma = 1, C = 10),
                  preProcess = c("center", "scale"))
})
## user system elapsed (original run, caret-tuned sigma/C)
## 23.185 0.115 37.997
# Set prediction, probability, and cv score variables in case needed
svmmod_pred <- predict(svmmod, test, 'raw')
svmmod_prob <- predict(svmmod, test, 'prob')
svmmod_scored <- cbind(test, svmmod_pred, svmmod_prob)
# AUC/ROC ("Yes" is the second factor level; explicit event_level replaces
# the deprecated options(yardstick.event_first = FALSE))
svmmod_auc <- svmmod_prob %>%
  yardstick::roc_auc(truth = test$BlueClass, Yes, event_level = "second")
svmmod_auc
# ROC curve + plot. Renamed from ROC_curve4/ROC_curve_plot4, which clobbered
# the GLM section's objects of the same name.
ROC_curve6 <- svmmod_prob %>%
  yardstick::roc_curve(truth = test$BlueClass, estimate = Yes,
                       event_level = "second") %>%
  dplyr::mutate(one_minus_specificity = 1 - specificity)
ROC_curve_plot6 <- ROC_curve6 %>%
  ggplot(aes(x = one_minus_specificity, y = sensitivity)) +
  geom_line() + geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = 'dashed', color = 'blue') +
  xlab("one_minus_specificity\n(false positive rate)") +
  # fixed copy-paste bug: this plot was previously titled 'LDA ROC curve'
  ggtitle('SVM ROC curve')
ggplotly(ROC_curve_plot6)
# Set new threshold on the CV hold-out predictions
svmmod_pred2 <- svmmod$pred %>%
  # the accuracy doesn't improve by reducing the threshold any further
  # than .52, 99.7% best.
  mutate(prediction = ifelse(Yes > .52, 'Yes', 'No')) %>%
  mutate(prediction = factor(prediction, levels = c('No', 'Yes')))
# New threshold matrix
confusionMatrix(svmmod_pred2$prediction, svmmod_pred2$obs, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 9655 25
## Yes 11 309
##
## Accuracy : 0.9964
## 95% CI : (0.995, 0.9975)
## No Information Rate : 0.9666
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9431
##
## Mcnemar's Test P-Value : 0.03026
##
## Sensitivity : 0.9251
## Specificity : 0.9989
## Pos Pred Value : 0.9656
## Neg Pred Value : 0.9974
## Prevalence : 0.0334
## Detection Rate : 0.0309
## Detection Prevalence : 0.0320
## Balanced Accuracy : 0.9620
##
## 'Positive' Class : Yes
##
Ntree = 500 indicates that 500 trees were created. I understand that it is important for this number to be substantial to reduce variance, but as we know, that can also lead to bias. There should be a sweet spot where every input row gets predicted at least a few times without overfitting the trees. Mtry=5 indicates that 5 variables were randomly sampled at each split. When mtry=p it can essentially equate to bagging, whereas if it's set to 1 it essentially chooses a random variable. I understand it is good to try out a few values that range no smaller than 2 and no larger than p. The gamma=1 svm parameter indicates that a single training example has far-reaching influence. A cost = 10 here means we are "paying" a high price for higher accuracy.
I tried to make this selection process as programmatic as possible. There was a lot of learning and I'm sure there is still a healthy amount that should be corrected within the code but I was glad to find the resources that I did. For mtry I used the tuneRF() function that accepts an initial value for mtry and returns the out-of-bag error for your input value as well as a few surrounding values. I chose mtry because it produced the lowest oob error and went on to choose ntrees from there. At that point I included my mtry=3 into several RF models with ntrees of different values and again looked for the highest accuracy. Ntrees=500 was the winner in that sense, and it appears to be a very commonly used value for that parameter. For the SVM parameters I used our class lab as a guide to run the tune() function for linear, radial and polynomial kernels with several cost, gamma and degree values respectively. The tune function returns a best performer, which was the radial kernel with cost = 10 and gamma = 1. When running the train function the sigma = 8.691262 and C = 1 values were returned as contributing to the highest accuracy and therefore were the most optimal values for those parameters.
The best performing algorithm in the cross validation spectrum would be GLM or QDA. Both run in under 1.5 seconds to the user (up to ~15 seconds faster than some of the others) and still have an accuracy over 99.4%. The hold-out data generally ran very slowly, which could be due to the size of the dataset, but given the consistently high accuracy and ease of use of these two functions I would be even more likely to advocate for them in a hold-out setting.
Yes, of course compatible. Time is always a consideration but not quite as much of a factor in our cross validation. You can nit pick fractions of a second there but generally they run comparably fast. I realized how much that can vary when executing other sets of data, particularly when they become extremely large. 99%+ accuracy, depending on the target and industry, is generally good among the metrics we see. If I had to choose a “desert island” algorithm for training and testing, it would most likely be one of these for being good performers in many scenarios.
For the detection of blue tarps I'd recommend using the radial kernel SVM function. It's valuable to tune your parameters according to your data and how you may define accuracy. Particularly with this pixel data, you could lower the threshold for determining a blue tarp knowing that it goes beyond the original tarp limits, but avoids the possibility of neglecting a human in need. The reason I became partial to these functions in the context of a natural disaster is that they can give you a starting point for additional tuning. Accuracy and automation are two things I would assume are extremely valuable in the wake of a natural disaster.
Well, I think it's telling that I speak mostly about accuracy and time in the context of this project. I think that is a direct nod to the context of an emergency situation. In a situation where time is not largely a factor, or not a factor at all, perhaps I would speak much more often about sensitivity, using sensitivity and accuracy as the drivers for choosing an algorithm rather than the speed/accuracy trade-off that a threatening situation can require.